R = matrix(cbind(1, 0.8, -0.5, 0, 0.8, 1, -0.3, 0.3, -0.5, -0.3, 1, 0.6, 0,
0.3, 0.6, 1), nrow = 4)
n <- 1000
k <- 4
M <- NULL
V <- NULL
mu <- c(15, 40, 5, 23) # vector of variable means
s <- c(5, 20, 4, 15) # vector of variable SDs
for (i in 1:k) {
V <- rnorm(n, mu[i], s[i])
M <- cbind(M, V)
}
M <- matrix(M, nrow = n, ncol = k)
orig <- as.data.frame(M)
names(orig) = c("Y", "X1", "X2", "X3")
head(orig)
## Y X1 X2 X3
## 1 10.05232 24.23691 12.833636 -15.902132
## 2 13.60772 33.48114 7.002115 39.440165
## 3 12.27792 42.10686 14.825860 11.298098
## 4 17.93078 31.17851 3.524137 40.305653
## 5 21.22642 19.19276 5.560270 0.362382
## 6 16.87792 16.23798 6.220589 47.502624
U = chol(R)
newM = M %*% U
new = as.data.frame(newM)
names(new) = c("Y", "X1", "X2", "X3")
cor(new) # note that is correlation matrix is what we are aiming for!
## Y X1 X2 X3
## Y 1.000000000 0.7958684 -0.4943766 0.002613573
## X1 0.795868402 1.0000000 -0.2756530 0.313944956
## X2 -0.494376644 -0.2756530 1.0000000 0.598618032
## X3 0.002613573 0.3139450 0.5986180 1.000000000
plot(orig)
plot(new)
df <- sweep(new, 2, STATS = sds, FUN = "*") # scale back out to original mean...
df <- sweep(df, 2, STATS = ms, FUN = "+") # and standard deviation
head(df)
## Y X1 X2 X3
## 1 10.05232 14.32967 13.201489 10.880472
## 2 13.60772 31.60467 7.054921 35.300357
## 3 12.27792 32.39339 14.563374 39.068799
## 4 17.93078 44.48391 2.187167 27.032812
## 5 21.22642 48.16405 2.109246 2.417693
## 6 16.87792 32.04645 4.413026 31.856923
cor(df)
## Y X1 X2 X3
## Y 1.000000000 0.7958684 -0.4943766 0.002613573
## X1 0.795868402 1.0000000 -0.2756530 0.313944956
## X2 -0.494376644 -0.2756530 1.0000000 0.598618032
## X3 0.002613573 0.3139450 0.5986180 1.000000000
plot(df)
library(ggplot2)
require(gridExtra)
## Loading required package: gridExtra
g1 <- ggplot(data = df, aes(x = X1, y = Y)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)
g2 <- ggplot(data = df, aes(x = X2, y = Y)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)
g3 <- ggplot(data = df, aes(x = X3, y = Y)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)
grid.arrange(g1, g2, g3, ncol = 3)
m1 <- lm(data = df, formula = Y ~ X1)
summary(m1)
##
## Call:
## lm(formula = Y ~ X1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.917 -2.095 0.147 1.990 8.384
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.171377 0.209183 34.28 <2e-16 ***
## X1 0.194804 0.004691 41.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.909 on 998 degrees of freedom
## Multiple R-squared: 0.6334, Adjusted R-squared: 0.633
## F-statistic: 1724 on 1 and 998 DF, p-value: < 2.2e-16
m2 <- lm(data = df, formula = Y ~ X2)
summary(m2)
##
## Call:
## lm(formula = Y ~ X2, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7947 -2.8163 -0.0116 2.7366 13.4699
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.87902 0.20882 85.62 <2e-16 ***
## X2 -0.58148 0.03236 -17.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.176 on 998 degrees of freedom
## Multiple R-squared: 0.2444, Adjusted R-squared: 0.2437
## F-statistic: 322.8 on 1 and 998 DF, p-value: < 2.2e-16
m3 <- lm(data = df, formula = Y ~ X3)
summary(m3)
##
## Call:
## lm(formula = Y ~ X3, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.2551 -3.3741 -0.0604 3.3342 16.0607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.953334 0.279693 53.463 <2e-16 ***
## X3 0.000831 0.010064 0.083 0.934
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.805 on 998 degrees of freedom
## Multiple R-squared: 6.831e-06, Adjusted R-squared: -0.0009952
## F-statistic: 0.006817 on 1 and 998 DF, p-value: 0.9342
m <- lm(data = df, formula = Y ~ X1 + X2 + X3)
coef(m)
## (Intercept) X1 X2 X3
## 9.5023647 0.1887012 -0.2539637 -0.0350254
summary(m)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0153 -1.8111 0.0204 1.7561 8.8276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.502365 0.242499 39.185 < 2e-16 ***
## X1 0.188701 0.005452 34.609 < 2e-16 ***
## X2 -0.253964 0.031055 -8.178 8.73e-16 ***
## X3 -0.035025 0.008499 -4.121 4.08e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.545 on 996 degrees of freedom
## Multiple R-squared: 0.72, Adjusted R-squared: 0.7192
## F-statistic: 853.8 on 3 and 996 DF, p-value: < 2.2e-16
# let's check if our residuals are random normal...
plot(fitted(m), residuals(m))
#### You want your fitted model and residuals plot to show a random distribution. This means that all the other variation that your model is not accounting for is random (and not capable of being explained by some other non-accounted for variable).
hist(residuals(m))
qqnorm(residuals(m))
#### What does this output tell us? First off, the results of the omnibus F test tells us that the overall model is significant; using these three variables, we explain signficantly more of the variation in the response variable, Y, than we would using a model with just an intercept, i.e., just that Y = mean(Y).
f <- (summary(m)$r.squared * (nrow(df) - (ncol(df) - 1) - 1))/((1 - summary(m)$r.squared) *
(ncol(df) - 1))
f
## [1] 853.8013
library(curl)
f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN597_Fall17/zombies.csv")
z <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = TRUE)
head(z)
## id first_name last_name gender height weight zombies_killed
## 1 1 Sarah Little Female 62.88951 132.0872 2
## 2 2 Mark Duncan Male 67.80277 146.3753 5
## 3 3 Brandon Perez Male 72.12908 152.9370 1
## 4 4 Roger Coleman Male 66.78484 129.7418 5
## 5 5 Tammy Powell Female 64.71832 132.4265 4
## 6 6 Anthony Green Male 71.24326 152.5246 1
## years_of_education major age
## 1 1 medicine/nursing 17.64275
## 2 3 criminal justice administration 22.58951
## 3 1 education 21.91276
## 4 6 energy studies 18.19058
## 5 3 logistics 21.10399
## 6 4 energy studies 21.48355
m <- lm(data = z, height ~ weight + age)
summary(m)
##
## Call:
## lm(formula = height ~ weight + age, data = z)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2278 -1.1782 -0.0574 1.1566 5.4117
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.763388 0.470797 67.47 <2e-16 ***
## weight 0.163107 0.002976 54.80 <2e-16 ***
## age 0.618270 0.018471 33.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.64 on 997 degrees of freedom
## Multiple R-squared: 0.8555, Adjusted R-squared: 0.8553
## F-statistic: 2952 on 2 and 997 DF, p-value: < 2.2e-16
library(car)
m <- lm(data = z, formula = height ~ gender + age)
summary(m)
##
## Call:
## lm(formula = height ~ gender + age, data = z)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1909 -1.7173 0.1217 1.7670 7.6746
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.74251 0.56869 82.19 <2e-16 ***
## genderMale 4.00224 0.16461 24.31 <2e-16 ***
## age 0.94091 0.02777 33.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.603 on 997 degrees of freedom
## Multiple R-squared: 0.6361, Adjusted R-squared: 0.6354
## F-statistic: 871.5 on 2 and 997 DF, p-value: < 2.2e-16
m.aov <- Anova(m, type = "II")
m.aov # Use Type II Error
## Anova Table (Type II tests)
##
## Response: height
## Sum Sq Df F value Pr(>F)
## gender 4003.9 1 591.15 < 2.2e-16 ***
## age 7775.6 1 1148.01 < 2.2e-16 ***
## Residuals 6752.7 997
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(fitted(m), residuals(m))
hist(residuals(m))
qqnorm(residuals(m))
#### To visualize it…
library(ggplot2)
p <- ggplot(data = z, aes(x = age, y = height)) + geom_point(aes(color = factor(gender))) +
scale_color_manual(values = c("goldenrod", "blue"))
p <- p + geom_abline(slope = m$coefficients[3], intercept = m$coefficients[1],
color = "goldenrod4")
p <- p + geom_abline(slope = m$coefficients[3], intercept = m$coefficients[1] +
m$coefficients[2], color = "darkblue")
p
#### Using the confint() function on our ANCOVA model results reveals the confidence intervals for each of the coefficients in our multiple regression, just as it did for simple regression.
m <- lm(data = z, formula = height ~ age + gender)
summary(m)
##
## Call:
## lm(formula = height ~ age + gender, data = z)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1909 -1.7173 0.1217 1.7670 7.6746
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.74251 0.56869 82.19 <2e-16 ***
## age 0.94091 0.02777 33.88 <2e-16 ***
## genderMale 4.00224 0.16461 24.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.603 on 997 degrees of freedom
## Multiple R-squared: 0.6361, Adjusted R-squared: 0.6354
## F-statistic: 871.5 on 2 and 997 DF, p-value: < 2.2e-16
confint(m, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 45.6265330 47.8584809
## age 0.8864191 0.9954081
## genderMale 3.6792172 4.3252593
m <- lm(data = z, height ~ age + gender + age:gender) # or
summary(m)
##
## Call:
## lm(formula = height ~ age + gender + age:gender, data = z)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7985 -1.6973 0.1189 1.7662 7.9473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.18107 0.79839 60.348 <2e-16 ***
## age 0.86913 0.03941 22.053 <2e-16 ***
## genderMale 1.15975 1.12247 1.033 0.3018
## age:genderMale 0.14179 0.05539 2.560 0.0106 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.595 on 996 degrees of freedom
## Multiple R-squared: 0.6385, Adjusted R-squared: 0.6374
## F-statistic: 586.4 on 3 and 996 DF, p-value: < 2.2e-16
m <- lm(data = z, height ~ age * gender)
summary(m)
##
## Call:
## lm(formula = height ~ age * gender, data = z)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7985 -1.6973 0.1189 1.7662 7.9473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.18107 0.79839 60.348 <2e-16 ***
## age 0.86913 0.03941 22.053 <2e-16 ***
## genderMale 1.15975 1.12247 1.033 0.3018
## age:genderMale 0.14179 0.05539 2.560 0.0106 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.595 on 996 degrees of freedom
## Multiple R-squared: 0.6385, Adjusted R-squared: 0.6374
## F-statistic: 586.4 on 3 and 996 DF, p-value: < 2.2e-16
coefficients(m)
## (Intercept) age genderMale age:genderMale
## 48.1810741 0.8691284 1.1597481 0.1417928
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN597_Fall17/KamilarAndCooperData.csv")
d <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = TRUE)
head(d)
## Scientific_Name Family Genus Species
## 1 Allenopithecus_nigroviridis Cercopithecidae Allenopithecus nigroviridis
## 2 Allocebus_trichotis Cercopithecidae Allocebus trichotis
## 3 Alouatta_belzebul Atelidae Alouatta belzebul
## 4 Alouatta_caraya Atelidae Alouatta caraya
## 5 Alouatta_guariba Atelidae Alouatta guariba
## 6 Alouatta_palliata Atelidae Alouatta palliata
## Brain_Size_Species_Mean Brain_Size_Female_Mean Brain_size_Ref
## 1 58.02 53.70 Isler et al 2008
## 2 NA NA
## 3 52.84 51.19 Isler et al 2008
## 4 52.63 47.80 Isler et al 2008
## 5 51.70 49.08 Isler et al 2008
## 6 49.88 48.04 Isler et al 2008
## Body_mass_male_mean Body_mass_female_mean Mass_Dimorphism
## 1 6130 3180 1.928
## 2 92 84 1.095
## 3 7270 5520 1.317
## 4 6525 4240 1.539
## 5 5800 4550 1.275
## 6 7150 5350 1.336
## Mass_Ref MeanGroupSize AdultMales AdultFemale
## 1 Isler et al 2008 NA NA NA
## 2 Smith and Jungers 1997 1.00 1.00 1.0
## 3 Isler et al 2008 7.00 1.00 1.0
## 4 Isler et al 2008 8.00 2.30 3.3
## 5 Isler et al 2008 6.53 1.37 2.2
## 6 Isler et al 2008 12.00 2.90 6.3
## AdultSexRatio
## 1 NA
## 2 NA
## 3 1.00
## 4 1.43
## 5 1.61
## 6 2.17
## Social_Organization_Ref
## 1
## 2 Kappeler 1997
## 3 Campbell et al 2007
## 4 van Schaik et al. 1999; Kappeler and Pereira 2003; Nunn & van Schaik 2000
## 5 Campbell et al 2007
## 6 van Schaik et al. 1999; Kappeler and Pereira 2003; Nunn & van Schaik 2000
## InterbirthInterval_d Gestation WeaningAge_d MaxLongevity_m LitterSz
## 1 NA NA 106.15 276.0 1.01
## 2 NA NA NA NA 1.00
## 3 NA NA NA NA NA
## 4 337.62 187 323.16 243.6 1.01
## 5 NA NA NA NA NA
## 6 684.37 186 495.60 300.0 1.02
## Life_History_Ref GR_MidRangeLat_dd Precip_Mean_mm Temp_Mean_degC
## 1 Jones et al. 2009 -0.17 1574.0 25.2
## 2 -16.59 1902.3 20.3
## 3 -6.80 1643.5 24.9
## 4 Jones et al. 2009 -20.34 1166.4 22.9
## 5 -21.13 1332.3 19.6
## 6 Jones et al. 2009 6.95 1852.6 23.7
## AET_Mean_mm PET_Mean_mm Climate_Ref HomeRange_km2
## 1 1517.8 1589.4 Jones et al. 2009 NA
## 2 1388.2 1653.7 Jones et al. 2009 NA
## 3 1286.6 1549.8 Jones et al. 2009 NA
## 4 1193.1 1404.9 Jones et al. 2009 NA
## 5 1225.7 1332.2 Jones et al. 2009 0.03
## 6 1300.0 1633.9 Jones et al. 2009 0.19
## HomeRangeRef DayLength_km DayLengthRef Territoriality Fruit
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA 57.3
## 4 0.40 Nunn et al. 2003 NA 23.8
## 5 Jones et al. 2009 NA NA 5.2
## 6 Jones et al. 2009 0.32 Nunn et al. 2003 0.6506 33.1
## Leaves Fauna DietRef1 Canine_Dimorphism
## 1 2.210
## 2 NA
## 3 19.1 0.0 Campbell et al. 2007 1.811
## 4 67.7 0.0 Campbell et al. 2007 1.542
## 5 73.0 0.0 Campbell et al. 2007 1.783
## 6 56.4 0.0 Campbell et al. 2007 1.703
## Canine_Dimorphism_Ref Feed Move Rest Social Activity_Budget_Ref
## 1 Plavcan & Ruff 2008 NA NA NA NA
## 2 NA NA NA NA
## 3 Plavcan & Ruff 2008 13.75 18.75 57.30 10.00 Campbell et al. 2007
## 4 Plavcan & Ruff 2008 15.90 17.60 61.60 4.90 Campbell et al. 2007
## 5 Plavcan & Ruff 2008 18.33 14.33 64.37 3.00 Campbell et al. 2007
## 6 Plavcan & Ruff 2008 17.94 12.32 66.14 3.64 Campbell et al. 2007
d <- select(d, Brain_Size_Female_Mean, Family, Body_mass_female_mean, MeanGroupSize,
DayLength_km, HomeRange_km2, Move)
m <- lm(data = d, log(HomeRange_km2) ~ log(Body_mass_female_mean) + log(Brain_Size_Female_Mean) +
MeanGroupSize + Move)
summary(m)
##
## Call:
## lm(formula = log(HomeRange_km2) ~ log(Body_mass_female_mean) +
## log(Brain_Size_Female_Mean) + MeanGroupSize + Move, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7978 -0.6473 -0.0038 0.8807 2.1598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.955853 1.957472 -3.553 0.000865 ***
## log(Body_mass_female_mean) 0.315276 0.468439 0.673 0.504153
## log(Brain_Size_Female_Mean) 0.614460 0.591100 1.040 0.303771
## MeanGroupSize 0.034026 0.009793 3.475 0.001095 **
## Move 0.025916 0.019559 1.325 0.191441
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.127 on 48 degrees of freedom
## (160 observations deleted due to missingness)
## Multiple R-squared: 0.6359, Adjusted R-squared: 0.6055
## F-statistic: 20.95 on 4 and 48 DF, p-value: 4.806e-10
plot(m$residuals)
qqnorm(m$residuals)
#### Test for normality
shapiro.test(m$residuals)
##
## Shapiro-Wilk normality test
##
## data: m$residuals
## W = 0.96517, p-value = 0.1242
m <- lm(data = d, log(HomeRange_km2) ~ log(Body_mass_female_mean) + log(Brain_Size_Female_Mean) +
MeanGroupSize)
summary(m)
##
## Call:
## lm(formula = log(HomeRange_km2) ~ log(Body_mass_female_mean) +
## log(Brain_Size_Female_Mean) + MeanGroupSize, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7982 -0.7310 -0.0140 0.8386 3.1926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.354845 1.176046 -4.553 1.45e-05 ***
## log(Body_mass_female_mean) -0.181627 0.311382 -0.583 0.560972
## log(Brain_Size_Female_Mean) 1.390536 0.398840 3.486 0.000721 ***
## MeanGroupSize 0.030433 0.008427 3.611 0.000473 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.228 on 103 degrees of freedom
## (106 observations deleted due to missingness)
## Multiple R-squared: 0.6766, Adjusted R-squared: 0.6672
## F-statistic: 71.85 on 3 and 103 DF, p-value: < 2.2e-16
plot(m$residuals)
qqnorm(m$residuals)
shapiro.test(m$residuals) # no significant deviation from normal
##
## Shapiro-Wilk normality test
##
## data: m$residuals
## W = 0.99366, p-value = 0.9058